% *************************************************************************
% 
%                      iGem Modelling - 2018
%                     Delft University of Technology
% 
% *************************************************************************
% 
%               Title
% 
%                    File: gRNA array
%                           
% 
%              
% *************************************************************************


clear; clc;

% Input gene CDS written 5'->3' (example: coding sequence of the EPO gene).
% Allowed letters are A/T/C/G (any case) and N; whitespace is ignored.
Gene='ATGGGGGTGCACGAATGTCCTGCCTGGCTGTGGCTTCTCCTGTCCCTGCTGTCGCTCCCTCTGGGCCTCCCAGTCCTGGGCGCCCCACCACGCCTCATCTGTGACAGCCGAGTCCTGGAGAGGTACCTCTTGGAGGCCAAGGAGGCCGAGAATATCACGACGGGCTGTGCTGAACATTGCAGCTTGAATGAGAATATCACTGTCCCAGACACCAAAGTTAATTTCTATGCCTGGAAGAGGATGGAGGTCGGGCAGCAGGCCGTAGAAGTCTGGCAGGGCCTGGCCCTGCTGTCGGAAGCTGTCCTGCGGGGCCAGGCCCTGTTGGTCAACTCTTCCCAGCCGTGGGAGCCCCTGCAACTGCATGTGGATAAAGCCGTCAGTGGCCTTCGCAGCCTCACCACTCTGCTTCGGGCTCTGGGAGCCCAGAAGGAAGCCATCTCCCCTCCAGATGCGGCCTCAGCTGCTCCACTCCGAACAATCACTGCTGACACTTTCCGCAAACTCTTCCGAGTCTACTCCAATTTCCTCCGGGGAAAGCTGAAGCTGTACACAGGGGAGGCGTGCAGGACAGGGGACAGATGA';
Junctions=[12, 159,246,426];  % Position in bp for junctions between exons (used as indices into the gene)

% Features of the Cas you are working with
Cas_PAM='G';         % PAM sequence for Cas
Cas_PAM_rev='C';     % PAM sequence for Cas, reverse strand
N=1;                 % Number of N in PAM behind Cas_PAM (eg. NG has N=1)
gRNA_size=20;        % Size of gRNA for Cas
Cas_on_target=10;    % Number of bp that have to be identical next to PAM


% Transform the genetic sequences to numerical row vectors
% A=1; T=2; C=3; G=4; N=0
% Every character is validated: silently dropping an unknown character
% would shift all downstream positions relative to Junctions.
base_letters='ATCG';
raw_seqs={Gene, Cas_PAM, Cas_PAM_rev};
seq_names={'Gene','Cas_PAM','Cas_PAM_rev'};
num_seqs=cell(size(raw_seqs));
for k=1:numel(raw_seqs)
    letters=raw_seqs{k};
    letters=upper(letters(~isspace(letters)));      % ignore whitespace, accept lowercase
    [is_base, code]=ismember(letters, base_letters); % code is 1..4, or 0 when not A/T/C/G
    bad=find(~is_base & letters~='N');
    if ~isempty(bad)
        error('gRNA_array:invalidBase', ...
            '%s contains the unsupported character ''%s'' at position %d (whitespace removed).', ...
            seq_names{k}, letters(bad(1)), bad(1));
    end
    num_seqs{k}=double(code);
end
Gene_num=num_seqs{1};
Cas_PAM_num=num_seqs{2};
Cas_PAM_rev_num=num_seqs{3};

% Obtain aa chain and number of possible codons for each codon position
[Gene_aa, Poss_cod] = protein_chain(Gene_num);


% Search possible PAM sequences +/- 20 bp from each junction. Then find
% possible gRNAs. Candidates whose PAM or codon window falls outside the
% gene are skipped instead of raising an index error.

n_bp=numel(Gene_num);        % Length of the encoded gene in bp
n_codons=numel(Poss_cod);    % Number of codons returned by protein_chain


% Search gRNAs with PAM in strand 5' to 3' (always 5'PAM3')
gRNA_position=[];  % Position in gene of the PAM for each candidate gRNA
gRNA_possible=[];  % Product of the per-codon codon counts covered by this gRNA
b=1;
for i=1:size(Junctions,2)
    for j=-20:20
        pam_pos=Junctions(i)+j;
        % Too close to the start codon (no room for the gRNA) or past the gene end
        if pam_pos<=gRNA_size+3 || pam_pos>n_bp
            continue
        end
        if Gene_num(pam_pos)==Cas_PAM_num
            first_cod=floor((pam_pos-N-Cas_on_target)/3);
            last_cod=ceil((pam_pos-2)/3);
            if first_cod<1 || last_cod>n_codons
                continue    % Codon window not fully inside the translated gene
            end
            gRNA_position(b)=pam_pos;
            gRNA_possible(b)=prod(Poss_cod(first_cod:last_cod));
            b=b+1;
        end
    end
end


% Search gRNAs with PAM in strand 3' to 5' (always 5' PAM 3')
gRNA_position_rev=[];  % Position in gene of the PAM for each candidate gRNA
gRNA_possible_rev=[];  % Product of the per-codon codon counts covered by this gRNA
b=1;
for i=1:size(Junctions,2)
    for j=-20:20
        pam_pos=Junctions(i)+j;
        % Before the first base or past the gene end
        if pam_pos<=0 || pam_pos>n_bp
            continue
        end
        if Gene_num(pam_pos)==Cas_PAM_rev_num
            first_cod=ceil((pam_pos+(N+1))/3);
            last_cod=ceil((pam_pos+2+Cas_on_target)/3);
            if last_cod>n_codons
                continue    % Codon window runs past the last translated codon
            end
            gRNA_position_rev(b)=pam_pos;
            gRNA_possible_rev(b)=prod(Poss_cod(first_cod:last_cod));
            b=b+1;
        end
    end
end

% Put together position (row 1) and number of possible guides (row 2).
% Forward-strand candidates come first, reverse-strand candidates after.
gRNA=[gRNA_position, gRNA_position_rev; gRNA_possible, gRNA_possible_rev];
if isempty(gRNA)
    error('gRNA_array:noPAM', ...
        'No usable PAM was found within 20 bp of any junction.');
end
min_gRNA=min(gRNA(2,:));                    % Smallest number of variations among the candidates
min_gRNA_poss=find(gRNA(2,:)==min_gRNA);    % Columns of gRNA that reach this minimum
min_gRNA_Geneposs=gRNA(1,min_gRNA_poss);    % PAM positions in Gene_num for those candidates



% Codon table in numeric bases (A=1; T=2; C=3; G=4).
% Row r holds the codons of the amino acid with index r (the indices used
% by protein_chain); each codon takes 3 consecutive columns and unused
% slots are padded with zeros.
A=[ 1 2 2 1 2 3 1 2 1 0 0 0 0 0 0 0 0 0;    %  1 Isoleucine
    3 2 2 3 2 3 3 2 1 3 2 4 2 2 1 2 2 4;    %  2 Leucine
    4 2 2 4 2 3 4 2 1 4 2 4 0 0 0 0 0 0;    %  3 Valine
    2 2 2 2 2 3 0 0 0 0 0 0 0 0 0 0 0 0;    %  4 Phenylalanine
    1 2 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;    %  5 Methionine
    2 4 2 2 4 3 0 0 0 0 0 0 0 0 0 0 0 0;    %  6 Cysteine
    4 3 2 4 3 3 4 3 1 4 3 4 0 0 0 0 0 0;    %  7 Alanine
    4 4 2 4 4 3 4 4 1 4 4 4 0 0 0 0 0 0;    %  8 Glycine
    3 3 2 3 3 3 3 3 1 3 3 4 0 0 0 0 0 0;    %  9 Proline
    1 3 2 1 3 3 1 3 1 1 3 4 0 0 0 0 0 0;    % 10 Threonine
    2 3 2 2 3 3 2 3 1 2 3 4 1 4 2 1 4 3;    % 11 Serine
    2 1 2 2 1 3 0 0 0 0 0 0 0 0 0 0 0 0;    % 12 Tyrosine
    2 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;    % 13 Tryptophan
    3 1 1 3 1 4 0 0 0 0 0 0 0 0 0 0 0 0;    % 14 Glutamine
    1 1 2 1 1 3 0 0 0 0 0 0 0 0 0 0 0 0;    % 15 Asparagine
    3 1 2 3 1 3 0 0 0 0 0 0 0 0 0 0 0 0;    % 16 Histidine
    4 1 1 4 1 4 0 0 0 0 0 0 0 0 0 0 0 0;    % 17 Glutamic acid
    4 1 2 4 1 3 0 0 0 0 0 0 0 0 0 0 0 0;    % 18 Aspartic acid
    1 1 1 1 1 4 0 0 0 0 0 0 0 0 0 0 0 0;    % 19 Lysine
    3 4 2 3 4 3 3 4 1 3 4 4 1 4 1 1 4 4];   % 20 Arginine

% aa_cod: one row per amino acid listing its codon choices 1..n, padded
% with zeros (e.g. an amino acid with 3 codons gives [1 2 3 0 0 0]).
codons_per_aa=[3 6 4 2 1 2 4 4 4 4 6 2 1 2 2 2 2 2 2 6];
aa_cod=zeros(numel(codons_per_aa),max(codons_per_aa));
for i=1:numel(codons_per_aa)
    n_choices=codons_per_aa(i);
    aa_cod(i,1:n_choices)=1:n_choices;
end



% Export the gRNA sequences
% For every selected PAM position: pick the neighbouring codons, enumerate
% all synonymous codon combinations, and assemble each combination with
% the fixed bases around the PAM. A zero column is inserted next to the
% PAM bases and a zero row separates the guides of consecutive positions.
gRNA_array=[];
n_forward=size(gRNA_possible,2);   % Columns of gRNA beyond this count are reverse-strand hits
n_keep=Cas_on_target+3;            % Number of columns kept for every guide
for i=1:size(min_gRNA_Geneposs,2)   % How many positions with small possibilities

    pam_pos=min_gRNA_Geneposs(i);
    on_reverse=min_gRNA_poss(i)>n_forward;
    frame=mod(pam_pos,3);   % 0: PAM is the final base of a codon
                            % 1: PAM starts a new codon
                            % 2: PAM is in the middle of a codon

    % Number of codons to take depends on the reading frame and the strand
    switch frame
        case 0
            extra_bp=1;
        case 1
            extra_bp=-1;
        otherwise
            extra_bp=0;
    end
    if ~on_reverse
        extra_bp=-extra_bp;
    end
    jj=ceil((Cas_on_target+extra_bp)/3);

    % Amino acids that can change: after the PAM codon on the reverse
    % strand, before it on the forward strand
    if on_reverse
        codon_idx=ceil(((pam_pos/3)+1):((pam_pos/3)+jj));
    else
        codon_idx=ceil(((pam_pos/3)-jj):((pam_pos/3)-1));
    end
    aa_sequence=Gene_aa(codon_idx);

    % Comb holds, per amino acid, the list of codon choices
    Comb=[];
    for ii=1:length(aa_sequence)
        Comb(ii,:)=aa_cod(aa_sequence(ii),:);
    end

    % All combinations of codon choices for the peptide chain
    combs=all_variations(Comb,ii);

    % Transform the codon combinations into base pairs (in numbers)
    codon_bp=aa_to_bpNum(combs,aa_sequence,A);

    n_rows=size(codon_bp,1);
    gap=zeros(n_rows,1);                                        % Zero column placed beside the PAM bases
    fixed=@(offset) Gene_num(pam_pos+offset).*ones(n_rows,1);   % Gene base repeated for every variant

    if on_reverse
        % PAM first, variable codons behind it
        switch frame
            case 0
                Sequence=[fixed(0),codon_bp(:,1),gap,codon_bp(:,2:end)];
            case 1
                Sequence=[fixed(0),fixed(1),gap,fixed(2),codon_bp];
            otherwise
                Sequence=[fixed(0),fixed(1),gap,codon_bp];
        end
        Sequence=Sequence(:,1:n_keep);
    else
        % Variable codons first, PAM last
        switch frame
            case 0
                Sequence=[codon_bp,fixed(-2),gap,fixed(-1),fixed(0)];
            case 1
                Sequence=[codon_bp(:,1:end-1),gap,codon_bp(:,end),fixed(0)];
            otherwise
                Sequence=[codon_bp,gap,fixed(-1),fixed(0)];
        end
        Sequence=Sequence(:,end-(Cas_on_target+2):end);
    end

    % Zero row in front of each group of guides
    gRNA_array=[gRNA_array; zeros(1,size(Sequence,2)); Sequence];

end

% Convert numerical Sequence into bp sequence with function 'num_to_base'
gRNA_array = num_to_base(gRNA_array);

% If you want to save the gRNA sequence on Excel, then uncomment this:
%filename='gRNA array EPO';
%xlswrite(filename,Sequence)


disp('The following list contains all the gRNAs necessary')
disp('These gRNAs have PAM sequences at positions:')
disp({num2str(min_gRNA_Geneposs),'bp of the gene'})
gRNA_array






%_________________________________________________________________________
% Functions used


% Function to obtain the amino acid sequence and the number of synonymous
% codons for every codon of a numerically encoded gene.
%
%   Gene_num : vector of bases encoded as A=1, T=2, C=3, G=4, read in
%              frame from the first element. Trailing bases that do not
%              fill a whole codon are ignored.
%   Gene_aa  : 1-by-nCodons amino acid index per codon (1 Ile, 2 Leu,
%              3 Val, 4 Phe, 5 Met, 6 Cys, 7 Ala, 8 Gly, 9 Pro, 10 Thr,
%              11 Ser, 12 Tyr, 13 Trp, 14 Gln, 15 Asn, 16 His, 17 Glu,
%              18 Asp, 19 Lys, 20 Arg, 0 for a STOP codon).
%   Poss_cod : 1-by-nCodons number of codons that encode the same amino
%              acid (3 for STOP).
%
% A codon containing any value outside 1..4 (e.g. 0 for N) yields 0 in
% both outputs. Outputs always have one entry per complete codon, and are
% empty (1-by-0) when the input holds fewer than 3 bases.
function [Gene_aa, Poss_cod] = protein_chain(Gene_num)

% Lookup table: row = 4*(first base - 1) + second base, column = third
% base (A T C G). Entries are amino acid indices; 21 marks a STOP codon.
codon_table = [ ...
    19 15 15 19;    % AA_  Lys Asn Asn Lys
     1  1  1  5;    % AT_  Ile Ile Ile Met
    10 10 10 10;    % AC_  Thr
    20 11 11 20;    % AG_  Arg Ser Ser Arg
    21 12 12 21;    % TA_  STOP Tyr Tyr STOP
     2  4  4  2;    % TT_  Leu Phe Phe Leu
    11 11 11 11;    % TC_  Ser
    21  6  6 13;    % TG_  STOP Cys Cys Trp
    14 16 16 14;    % CA_  Gln His His Gln
     2  2  2  2;    % CT_  Leu
     9  9  9  9;    % CC_  Pro
    20 20 20 20;    % CG_  Arg
    17 18 18 17;    % GA_  Glu Asp Asp Glu
     3  3  3  3;    % GT_  Val
     7  7  7  7;    % GC_  Ala
     8  8  8  8];   % GG_  Gly

% Reported amino acid index and codon count for table entries 1..21
aa_index   = [1:20, 0];
codon_count = [3 6 4 2 1 2 4 4 4 4 6 2 1 2 2 2 2 2 2 6 3];

n_codons = floor(numel(Gene_num)/3);
Gene_aa  = zeros(1,n_codons);
Poss_cod = zeros(1,n_codons);

for i = 1:n_codons
    codon = Gene_num(3*i-2:3*i);
    if all(ismember(codon,1:4))     % skip codons with unknown bases
        entry = codon_table(4*(codon(1)-1)+codon(2), codon(3));
        Gene_aa(i)  = aa_index(entry);
        Poss_cod(i) = codon_count(entry);
    end
end

end


% Function that passes the first ii rows of Combs as separate arguments to
% 'variations' and returns its result.
%
%   Combs  : matrix with one row of choices per amino acid
%   ii     : number of rows of Combs to combine (any count >= 1)
%   output : result of variations(Combs(1,:), ..., Combs(ii,:))
%
% Raises an error when ii is empty, below 1, or larger than the number of
% rows in Combs.
function output = all_variations(Combs,ii)
    if isempty(ii) || ii<1 || ii>size(Combs,1)
        error('gRNA_array:badRowCount', ...
            'all_variations needs between 1 and %d rows to combine.', size(Combs,1));
    end

    % Collect the rows in a cell array so they can be expanded into an
    % argument list of any length
    rows = cell(1,ii);
    for k = 1:ii
        rows{k} = Combs(k,:);
    end
    output = variations(rows{:});
end

% Function that generates all combinations of the elements of its input
% vectors (one element taken from each input). Each row of the output is
% one combination, with column k taken from input k; rows that contain a
% zero are removed.
function output = variations(varargin)
    % One n-dimensional grid per input vector
    grids = cell(1,nargin);
    [grids{:}] = ndgrid(varargin{:});

    % Flatten every grid into a column and place the columns side by side
    columns = cellfun(@(g) g(:), grids, 'UniformOutput', false);
    output = [columns{:}];

    % Drop every combination that contains a zero in any column
    has_zero = any(output==0,2);
    output(has_zero,:) = [];
end

% Function that turns codon choices into numeric base sequences.
%
%   combs       : one row per variant; combs(v,p) is the codon choice for
%                 the amino acid at position p
%   aa_sequence : amino acid index (row of A) for every position
%   A           : codon table, 3 consecutive columns per codon
%   Sequence    : one row per variant, 3 columns per amino acid position
function Sequence = aa_to_bpNum(combs,aa_sequence,A)
    n_variants = size(combs,1);
    n_aa = length(aa_sequence);
    Sequence = zeros(n_variants,3*n_aa);
    for pos = 1:n_aa
        out_cols = 3*pos-2:3*pos;           % Columns of this codon in the output
        for variant = 1:n_variants
            choice = combs(variant,pos);    % Which codon of this amino acid
            Sequence(variant,out_cols) = A(aa_sequence(pos),3*choice-2:3*choice);
        end
    end
end

% Function that converts a numeric sequence back to letters:
% 1 -> 'A', 2 -> 'T', 3 -> 'C', 4 -> 'G'. Any other value is passed to
% char() unchanged (so 0 becomes the character with code 0).
function output = num_to_base(Sequence)
    letter_codes = double('ATCG');     % Character codes 65 84 67 71
    for base = 1:numel(letter_codes)
        Sequence(Sequence==base) = letter_codes(base);
    end
    output = char(Sequence);
end
